import pandas as pd
import numpy as np
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly.express as px
from topicmodel import TopicModel
# Index levels of the token table, ordered from book down to token.
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']

# Bag-of-words table. term_str is cast to str before it becomes an index
# level, so every term label is a string.
BOW = (
    pd.read_csv("twain_BOW.csv")
    .astype({'term_str': 'str'})
    .set_index(['book_id', 'chap_id', 'term_str'])
)

# Library (one row per book) and token-level corpus tables.
LIB = pd.read_csv("twain_pre_LIB.csv", index_col=['book_id'])
CORPUS = pd.read_csv("twain_pre_CORPUS.csv", index_col=OHCO)

# Vocabulary table keyed by term string, with the same str cast as BOW.
VOCAB = (
    pd.read_csv("twain_pre_VOCAB.csv")
    .astype({'term_str': 'str'})
    .set_index('term_str')
)
# Coarse part-of-speech group: the first two characters of max_pos.
VOCAB['pos_group'] = VOCAB['max_pos'].str[:2]

# Chapter-by-term table of token counts; a term that never occurs in a
# chapter is left as NaN by unstack().
CHAPS = (
    CORPUS
    .groupby([*OHCO[:2], 'term_str'])['term_str']
    .count()
    .unstack()
)
# Document frequency: number of chapters with a non-NaN count for the term.
VOCAB['df'] = CHAPS.count()
# df * log2(number of chapters / df)
VOCAB['dfidf'] = VOCAB['df'] * np.log2(len(CHAPS) / VOCAB['df'])
VOCAB.head()
| n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | pos_group | df | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||
| 0 | 5 | 1 | 0.000002 | 19.180285 | CD | 1 | {'CD'} | 0 | 0 | 0 | 0 | CD | 2.0 | 18.227484 |
| 00 | 3 | 2 | 0.000001 | 19.917251 | NN | 2 | {'NN', 'NNS'} | 0 | 00 | 00 | 00 | NN | 1.0 | 10.113742 |
| 01 | 3 | 2 | 0.000001 | 19.917251 | NNS | 2 | {'NN', 'NNS'} | 0 | 01 | 01 | 01 | NN | 1.0 | 10.113742 |
| 02 | 4 | 2 | 0.000001 | 19.502213 | NN | 3 | {'POS', 'NN', 'NNP'} | 0 | 02 | 02 | 02 | NN | 2.0 | 18.227484 |
| 03 | 6 | 2 | 0.000002 | 18.917251 | NN | 3 | {'POS', 'NN', 'NNS'} | 0 | 03 | 03 | 03 | NN | 1.0 | 10.113742 |
# Preview the first rows of BOW (indexed by book_id, chap_id, term_str).
BOW.head()
| n | tf | tfidf | |||
|---|---|---|---|---|---|
| book_id | chap_id | term_str | |||
| 70 | 1 | 1835 | 1 | 0.142857 | 1.159106 |
| 1910 | 1 | 0.142857 | 1.075540 | ||
| a | 2 | 0.285714 | 0.002238 | ||
| alphabet | 1 | 0.142857 | 0.991974 | ||
| as | 2 | 0.285714 | 0.013615 |
# Preview the first rows of LIB (indexed by book_id).
LIB.head()
| source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | |
|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||
| 70 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 |
| 74 | Twain/74-the_adventures_of_tom_sawyer.txt | the adventures of tom sawyer | ^\s*CHAPTER\s*[IVXLCM]+$ | twain | novel | 1876 | 1870 | 35 | 70276 |
| 76 | Twain/76-the_adventures_of_huckleberry_finn.txt | the adventures of huckleberry finn | ^\s*CHAPTER\s*(?:[IVXLCM]+\.|THE LAST)$ | twain | novel | 1884 | 1880 | 43 | 111908 |
| 86 | Twain/86-a_connecticut_yankee_in_king_arthurs_... | a connecticut yankee in king arthurs court | ^\s*(?:PREFACE|A WORD OF EXPLANATION|THE STRAN... | twain | novel | 1889 | 1880 | 47 | 119100 |
| 91 | Twain/91-tom_sawyer_abroad.txt | tom sawyer abroad | CHAPTER\s[IVXLCM]+\. | twain | novel | 1894 | 1890 | 13 | 33969 |
# Attach the VOCAB attributes of each term to its BOW rows. BOW is re-indexed
# by term_str alone so the join matches on the term; VOCAB columns whose names
# clash with BOW columns get the "_vocab" suffix.
joint_BOW = (
    BOW.reset_index()
    .set_index('term_str')
    .join(VOCAB, rsuffix="_vocab")
)
# Keep only the rows that have no missing value in any column.
joint_BOW = joint_BOW.dropna()
# Discard terms whose max_pos tag is NNP or NNPS (proper nouns).
joint_BOW = joint_BOW.loc[~joint_BOW['max_pos'].isin(['NNP', 'NNPS'])]
joint_BOW
| book_id | chap_id | n | tf | tfidf | n_vocab | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | pos_group | df | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||||||
| 0 | 3199 | 1 | 2 | 0.008439 | 0.076909 | 5 | 1 | 1.683290e-06 | 19.180285 | CD | 1 | {'CD'} | 0 | 0 | 0 | 0 | CD | 2.0 | 18.227484 |
| 0 | 3251 | 6 | 3 | 0.004587 | 0.041806 | 5 | 1 | 1.683290e-06 | 19.180285 | CD | 1 | {'CD'} | 0 | 0 | 0 | 0 | CD | 2.0 | 18.227484 |
| 00 | 3199 | 24 | 3 | 0.012448 | 0.125897 | 3 | 2 | 1.009974e-06 | 19.917251 | NN | 2 | {'NN', 'NNS'} | 0 | 00 | 00 | 00 | NN | 1.0 | 10.113742 |
| 01 | 3199 | 25 | 3 | 0.013699 | 0.138544 | 3 | 2 | 1.009974e-06 | 19.917251 | NNS | 2 | {'NN', 'NNS'} | 0 | 01 | 01 | 01 | NN | 1.0 | 10.113742 |
| 02 | 3186 | 14 | 1 | 0.005464 | 0.049802 | 4 | 2 | 1.346632e-06 | 19.502213 | NN | 3 | {'POS', 'NN', 'NNP'} | 0 | 02 | 02 | 02 | NN | 2.0 | 18.227484 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| étouffante | 60900 | 5 | 1 | 0.007752 | 0.078401 | 1 | 10 | 3.366579e-07 | 21.502213 | NN | 1 | {'NN'} | 0 | étouffant | étouffant | étouff | NN | 1.0 | 10.113742 |
| évitant | 3189 | 3 | 1 | 0.004132 | 0.041792 | 1 | 7 | 3.366579e-07 | 21.502213 | VBP | 1 | {'VBP'} | 0 | évitant | évitant | évit | VB | 1.0 | 10.113742 |
| êtes | 3189 | 3 | 1 | 0.004132 | 0.041792 | 1 | 4 | 3.366579e-07 | 21.502213 | NNS | 1 | {'NNS'} | 0 | ête | êtes | ête | NN | 1.0 | 10.113742 |
| öffnen | 60900 | 6 | 1 | 0.004608 | 0.046607 | 1 | 6 | 3.366579e-07 | 21.502213 | NN | 1 | {'NN'} | 0 | öffnen | öffnen | öffnen | NN | 1.0 | 10.113742 |
| übergeschlagen | 60900 | 6 | 1 | 0.004608 | 0.046607 | 1 | 14 | 3.366579e-07 | 21.502213 | NN | 1 | {'NN'} | 0 | übergeschlagen | übergeschlagen | übergeschl | NN | 1.0 | 10.113742 |
877057 rows × 19 columns
# Recover a BOW-shaped table from the filtered join: drop 'n_vocab' and every
# column after it (the columns contributed by VOCAB), then restore the
# (book_id, chap_id, term_str) index.
filtered_BOW = (
    joint_BOW
    .drop(columns=joint_BOW.loc[:, 'n_vocab':].columns)
    .reset_index()
    .set_index(['book_id', 'chap_id', 'term_str'])
)
# Sort on the whole index instead of on book_id alone. Sorting by the single
# book_id key with the default (non-stable) algorithm left chapters and terms
# in an arbitrary order inside each book; sort_index() gives a deterministic
# row order and a lexsorted MultiIndex.
filtered_BOW = filtered_BOW.sort_index()
filtered_BOW
| n | tf | tfidf | |||
|---|---|---|---|---|---|
| book_id | chap_id | term_str | |||
| 70 | 10 | read | 3 | 0.014423 | 0.019551 |
| stock | 1 | 0.004808 | 0.013125 | ||
| 16 | stock | 1 | 0.010989 | 0.030000 | |
| 17 | stock | 2 | 0.001498 | 0.004090 | |
| 2 | inert | 1 | 0.000732 | 0.005080 | |
| ... | ... | ... | ... | ... | ... |
| 62739 | 4 | two | 5 | 0.017668 | 0.003841 |
| 5 | two | 4 | 0.038095 | 0.008282 | |
| 4 | most | 3 | 0.010601 | 0.004905 | |
| 2 | everything | 2 | 0.005556 | 0.006925 | |
| officials | 1 | 0.002778 | 0.012337 |
877057 rows × 3 columns
# Fraction of the original BOW rows that are absent from filtered_BOW (~5%).
# This covers both filters applied to the join: rows containing NaN and terms
# tagged NNP/NNPS (proper nouns, singular and plural).
(len(BOW) - len(filtered_BOW)) / len(BOW)
0.05007110465109982
# Settings copied onto the TopicModel instance below.
n_topics = 40
n_terms = 2000
# Build the project-local TopicModel from the filtered bag-of-words table.
tm = TopicModel(filtered_BOW)
tm.n_topics = n_topics
tm.n_terms = n_terms
# NOTE(review): the calls below presumably have to run in this order, each
# step consuming state produced by the previous one -- confirm against
# topicmodel.py.
tm.create_X()
tm.get_model()
tm.describe_topics()
tm.get_model_stats()
tm.plot_topics()
# Table with the distribution of topics for each document; rows are
# (book_id, chap_id) pairs and columns are topic ids.
tm.THETA
| topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||||
| 70 | 1 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.678106 | 0.002273 | 0.002273 | 0.002273 | ... | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 | 0.002273 |
| 2 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | ... | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.000005 | 0.999816 | 0.000005 | 0.000005 | |
| 3 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.010489 | 0.000036 | 0.000036 | 0.000036 | 0.009727 | 0.088655 | ... | 0.112232 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.000036 | 0.057427 | |
| 4 | 0.000038 | 0.000038 | 0.000038 | 0.000038 | 0.000038 | 0.000038 | 0.435486 | 0.000038 | 0.000038 | 0.000038 | ... | 0.043335 | 0.000038 | 0.000038 | 0.000038 | 0.000038 | 0.000038 | 0.107206 | 0.000038 | 0.021042 | 0.000038 | |
| 5 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.105762 | 0.000028 | 0.000028 | 0.314828 | ... | 0.019001 | 0.000028 | 0.174355 | 0.000028 | 0.000028 | 0.000028 | 0.015326 | 0.000028 | 0.000028 | 0.000028 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 2 | 0.000023 | 0.073183 | 0.036687 | 0.000023 | 0.005265 | 0.000023 | 0.379604 | 0.000023 | 0.059696 | 0.000023 | ... | 0.109367 | 0.000023 | 0.000023 | 0.000023 | 0.000023 | 0.000023 | 0.102332 | 0.000023 | 0.030935 | 0.027682 |
| 3 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.411287 | 0.000227 | 0.165402 | 0.000227 | ... | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | 0.000227 | |
| 4 | 0.000054 | 0.050824 | 0.000054 | 0.000054 | 0.025275 | 0.000054 | 0.653492 | 0.000054 | 0.000054 | 0.037204 | ... | 0.000054 | 0.000054 | 0.000054 | 0.000054 | 0.000054 | 0.037396 | 0.000054 | 0.000054 | 0.000054 | 0.000054 | |
| 5 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.692254 | 0.000144 | 0.000144 | 0.000144 | ... | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | 0.000144 | |
| 6 | 0.000581 | 0.000581 | 0.000581 | 0.000581 | 0.000581 | 0.000581 | 0.513626 | 0.000581 | 0.000581 | 0.000581 | ... | 0.000581 | 0.000581 | 0.068624 | 0.000581 | 0.000581 | 0.000581 | 0.000581 | 0.000581 | 0.000581 | 0.000581 |
1108 rows × 40 columns
# Distribution of words over topics: one row per topic_id, one column per term.
# NOTE(review): the displayed values are not normalised to sum to 1 per topic;
# they look like raw term weights -- confirm against topicmodel.py.
tm.PHI
| term_str | german | ancient | allowed | art | thou | private | month | mile | om | curious | ... | sons | bag | insane | fearful | motion | debt | council | greater | features | frank |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| topic_id | |||||||||||||||||||||
| 0 | 8.025308 | 0.025000 | 0.025000 | 7.762330 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 0.025000 | ... | 7.107562 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.697142 | 0.025000 |
| 1 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 0.025000 | ... | 0.025000 | 5.726570 | 0.025000 | 2.914376 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 2 | 0.025000 | 0.025000 | 27.847490 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 139.687435 | 0.025 | 11.561690 | ... | 0.025000 | 4.583850 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 3 | 0.025000 | 0.025000 | 0.025000 | 3.552965 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 16.400235 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 4 | 0.025000 | 0.025000 | 3.561333 | 0.859023 | 0.025000 | 0.025000 | 0.725470 | 0.025000 | 0.025 | 4.394222 | ... | 1.419074 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.756410 | 0.025000 |
| 5 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 7.134957 | 5.021558 | 0.025 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 2.153632 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 6 | 1.606724 | 58.063440 | 78.634398 | 65.521863 | 0.025000 | 60.510445 | 25.791504 | 15.281461 | 0.025 | 83.738623 | ... | 24.707497 | 2.054964 | 57.168346 | 0.025000 | 0.025000 | 16.025999 | 8.829800 | 28.392960 | 8.589827 | 29.077507 |
| 7 | 0.025000 | 1.403531 | 0.025000 | 0.025000 | 0.025000 | 2.251557 | 0.025000 | 0.025000 | 0.025 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.429973 | 0.025000 | 0.025000 | 0.025000 |
| 8 | 0.025000 | 15.812970 | 3.202335 | 5.097929 | 3.700189 | 12.573707 | 37.355050 | 0.025000 | 0.025 | 12.738382 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 3.341822 | 0.025000 | 0.025000 | 14.447153 | 0.025231 | 1.890293 |
| 9 | 142.250248 | 52.863735 | 40.318434 | 109.878943 | 0.025000 | 41.759558 | 28.939902 | 0.025000 | 0.025 | 35.785724 | ... | 0.025000 | 0.025000 | 0.025000 | 3.924673 | 0.025000 | 1.363719 | 4.757888 | 4.888110 | 26.482373 | 0.025000 |
| 10 | 14.314383 | 23.783597 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 5.974309 | 162.025042 | 0.025 | 23.526947 | ... | 0.025000 | 9.910046 | 0.025000 | 36.847709 | 19.076858 | 0.025000 | 13.934602 | 15.522342 | 21.093618 | 0.025000 |
| 11 | 0.025000 | 6.575866 | 13.534899 | 11.425903 | 2.618971 | 42.418284 | 10.252781 | 0.025000 | 0.025 | 13.772151 | ... | 0.025000 | 0.025000 | 2.672713 | 13.398626 | 9.167290 | 15.281968 | 0.025000 | 0.025000 | 0.025000 | 2.979413 |
| 12 | 0.025000 | 26.779278 | 0.025000 | 57.983577 | 325.409821 | 5.168668 | 0.025000 | 0.025000 | 0.025 | 2.064034 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 11.939993 | 6.189663 | 5.101878 | 1.234023 |
| 13 | 39.439127 | 0.025000 | 0.025000 | 1.055050 | 1.027437 | 2.131427 | 0.025000 | 0.025000 | 0.025 | 1.066924 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 14 | 0.025000 | 0.489465 | 4.510259 | 9.014678 | 0.025000 | 0.025000 | 23.411455 | 0.025000 | 0.025 | 0.425207 | ... | 0.025000 | 0.257593 | 0.025000 | 0.025000 | 3.518006 | 0.025000 | 0.025000 | 0.025000 | 1.884590 | 0.025000 |
| 15 | 23.423745 | 3.189324 | 25.701895 | 3.291962 | 0.025000 | 41.679256 | 107.879904 | 0.025000 | 0.025 | 13.282712 | ... | 0.025000 | 5.538702 | 0.025000 | 0.025000 | 0.025000 | 26.295302 | 0.025000 | 16.712337 | 0.025000 | 11.237336 |
| 16 | 0.025000 | 0.025000 | 0.037356 | 0.025000 | 0.025000 | 0.962268 | 0.025000 | 0.025000 | 0.025 | 0.025000 | ... | 0.025000 | 1.149581 | 0.025000 | 0.025000 | 0.025000 | 1.081555 | 2.070345 | 0.025000 | 2.056835 | 0.025000 |
| 17 | 0.025000 | 5.672838 | 13.243987 | 57.442185 | 0.025000 | 9.417616 | 0.025000 | 0.025000 | 0.025 | 6.186159 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 3.531460 | 6.819559 |
| 18 | 10.479892 | 7.029229 | 8.349322 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 3.039370 | 0.025 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.981913 | 0.025000 | 15.729645 |
| 19 | 0.025000 | 0.025000 | 3.775352 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 0.025000 | ... | 2.264966 | 0.025000 | 31.397044 | 5.241473 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 20 | 4.107050 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 6.833952 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 21 | 0.025000 | 0.540559 | 0.025000 | 13.926766 | 0.025000 | 0.096237 | 0.025000 | 1.830500 | 0.025 | 14.913016 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.971335 | 0.025000 | 0.025000 | 2.103139 |
| 22 | 10.543879 | 4.634565 | 1.686472 | 1.183793 | 0.025000 | 16.212240 | 0.557923 | 19.812351 | 0.025 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 6.643565 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 23 | 0.025000 | 1.297191 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 6.626895 | ... | 0.025000 | 2.325532 | 0.025000 | 1.469624 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 24 | 11.474755 | 4.285487 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 0.025000 | ... | 0.025000 | 2.558256 | 0.025000 | 0.025000 | 3.232359 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 25 | 6.360447 | 0.025000 | 1.401918 | 0.025000 | 0.025000 | 30.328421 | 0.025000 | 7.102952 | 0.025 | 20.782576 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 6.614393 | 0.025000 | 0.025000 | 0.025000 | 5.502378 | 4.087546 |
| 26 | 0.025000 | 6.716472 | 19.627906 | 7.721242 | 0.025000 | 58.224471 | 0.025000 | 2.862189 | 0.025 | 0.026588 | ... | 0.025000 | 0.025000 | 6.063031 | 2.768398 | 3.257176 | 9.732187 | 0.025000 | 0.237005 | 12.572618 | 9.881496 |
| 27 | 0.025000 | 8.012943 | 13.513383 | 41.480125 | 65.362325 | 0.025000 | 0.025000 | 0.025000 | 0.025 | 6.248925 | ... | 2.248895 | 0.025000 | 0.025000 | 0.025000 | 2.710533 | 0.025000 | 0.025000 | 3.084044 | 1.637266 | 0.025000 |
| 28 | 0.025000 | 0.025000 | 47.382806 | 0.025000 | 0.025000 | 18.850184 | 0.025000 | 13.993898 | 0.025 | 9.592356 | ... | 1.906721 | 54.453012 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.221147 |
| 29 | 0.025000 | 8.749385 | 21.605058 | 0.025000 | 0.025000 | 8.264212 | 64.480949 | 14.336920 | 0.025 | 0.025000 | ... | 3.228156 | 1.784039 | 0.025000 | 0.206626 | 0.025000 | 18.025589 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 30 | 0.025000 | 0.887764 | 12.717148 | 0.025000 | 0.025000 | 9.045312 | 1.674156 | 0.534488 | 0.025 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 58.266064 | 9.078557 | 0.025000 | 11.708979 |
| 31 | 132.133853 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 10.678900 | 0.899617 | 0.025000 | 0.025 | 10.279524 | ... | 0.025000 | 0.861629 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 2.740916 | 0.025000 | 5.318947 |
| 32 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 10.873960 | 12.503969 | 0.025000 | 0.025 | 16.425912 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 4.070197 | 0.025000 |
| 33 | 0.025000 | 0.025000 | 0.025000 | 7.498789 | 0.025000 | 0.025000 | 7.400956 | 0.025000 | 0.025 | 14.873121 | ... | 13.933161 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 34 | 0.025000 | 10.432333 | 0.025000 | 2.889008 | 0.025000 | 0.025000 | 0.025000 | 4.646002 | 0.025 | 1.526578 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 35 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 10.402455 | 2.505819 | 0.025 | 0.025000 | ... | 0.025000 | 11.143352 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 36 | 0.025000 | 4.499162 | 61.774934 | 0.025000 | 0.025000 | 7.718947 | 19.768136 | 0.025000 | 0.025 | 42.497607 | ... | 23.880294 | 0.025000 | 5.823866 | 18.348091 | 31.583421 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 0.060970 |
| 37 | 0.025000 | 1.025000 | 0.025000 | 0.025000 | 0.025000 | 2.025000 | 0.025000 | 1.025000 | 404.025 | 0.025000 | ... | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 1.025000 | 1.025000 | 0.025000 | 0.025000 | 1.025000 | 0.025000 |
| 38 | 3.165588 | 4.399512 | 0.025000 | 0.025000 | 0.025000 | 19.640129 | 46.296508 | 9.127000 | 0.025 | 7.811724 | ... | 0.025000 | 1.002873 | 0.025000 | 0.025000 | 3.372907 | 12.265049 | 0.025000 | 0.025000 | 0.025000 | 0.025000 |
| 39 | 0.025000 | 150.456353 | 4.098316 | 2.888871 | 12.031257 | 0.744201 | 0.025000 | 8.593013 | 0.025 | 37.668450 | ... | 22.553674 | 0.025000 | 0.025000 | 11.511840 | 0.025000 | 0.025000 | 0.025000 | 0.025000 | 6.373177 | 0.025000 |
40 rows × 2000 columns
# Topic summary table, ordered from the largest to the smallest theta_sum.
tm.TOPIC.sort_values('theta_sum', ascending = False)
| phi_sum | theta_sum | h | top_terms_rel | top_terms | label | |
|---|---|---|---|---|---|---|
| topic_id | ||||||
| 36 | 56674.817260 | 127.577716 | 10.13 | awake stillness cheer whispered sprang wound c... | boys sat voice answer followed fire tried brok... | 36: awake stillness cheer whispered sprang wou... |
| 6 | 44859.419532 | 88.110400 | 9.92 | science reverence civilization government nati... | government history law race state nation human... | 6: science reverence civilization government n... |
| 10 | 40207.637360 | 84.184373 | 9.90 | glacier steep summit rope scenery ice gloom mo... | distance mountain foot deep ground behind sun ... | 10: glacier steep summit rope scenery ice gloo... |
| 11 | 30476.677087 | 59.764951 | 9.89 | earl sack song loving glanced confess gratitud... | father happy child herself sat voice wife stra... | 11: earl sack song loving glanced confess grat... |
| 28 | 19540.766202 | 51.483016 | 9.05 | hes youre theyre nigger haint reckon coffin th... | hes reckon theres nigger youre wont duke youll... | 28: hes youre theyre nigger haint reckon coffi... |
| 9 | 23108.804196 | 49.600283 | 9.86 | corps students student castle italian pictures... | pictures picture table german castle art fine ... | 9: corps students student castle italian pictu... |
| 30 | 16947.303924 | 43.172027 | 9.28 | knights council sword wounded soldiers army so... | war battle army child march sent herself frenc... | 30: knights council sword wounded soldiers arm... |
| 26 | 19734.061371 | 41.714986 | 9.67 | authors sincerely writer guest society hall re... | perhaps father wrote books society hall suppos... | 26: authors sincerely writer guest society hal... |
| 2 | 16895.707949 | 40.930780 | 9.07 | raft canoe reckoned cave warnt scared knowed b... | warnt raft big boys mile begun run reckon minute | 2: raft canoe reckoned cave warnt scared knowe... |
| 15 | 30783.182917 | 39.717141 | 9.62 | letters lecture 12 yours 7 10 9 magazine 3 | letters write wrote written send story yours w... | 15: letters lecture 12 yours 7 10 9 magazine 3 |
| 21 | 15361.746982 | 34.968000 | 9.23 | bird wings brush birds tree shape hasnt begins... | tree black bird makes comes goes big heaven looks | 21: bird wings brush birds tree shape hasnt be... |
| 8 | 13636.500040 | 34.248539 | 9.43 | husband baby married poetry marry wife disease... | wife child chapter husband friend married doct... | 8: husband baby married poetry marry wife dise... |
| 39 | 16583.089258 | 33.596436 | 9.14 | pilgrims temple marble priests stone centuries... | stone church ancient marble walls pilgrims bui... | 39: pilgrims temple marble priests stone centu... |
| 38 | 12089.462258 | 30.269851 | 9.18 | mining mines gold mill silver sold coal sell p... | gold silver rich worth sold mine mining mines ... | 38: mining mines gold mill silver sold coal se... |
| 25 | 11914.347299 | 29.169184 | 9.12 | car conductor train seats railway railroad lad... | train car hotel lady ladies gentlemen public c... | 25: car conductor train seats railway railroad... |
| 29 | 11689.267902 | 28.193424 | 9.07 | pilots pilot wages clerk association wheel cen... | pilot pay cent pilots boat wages bank clerk buy | 29: pilots pilot wages clerk association wheel... |
| 32 | 11099.074186 | 26.910419 | 9.24 | editor minister police journal prison paper pa... | paper public editor school write office papers... | 32: editor minister police journal prison pape... |
| 12 | 11305.222550 | 26.289940 | 9.13 | hath tis thou thy thee prince lad royal mad | thou thy thee prince hath none tis ye royal | 12: hath tis thou thy thee prince lad royal mad |
| 14 | 12234.126069 | 25.794035 | 8.62 | vessel ships ship captain mate deck port passe... | ship captain sea boat island deck ships island... | 14: vessel ships ship captain mate deck port p... |
| 22 | 11200.302142 | 24.080291 | 9.41 | stove fired gun negro killed guns armed shot m... | killed shot kill war horse stove officer box road | 22: stove fired gun negro killed guns armed sh... |
| 31 | 6793.966237 | 17.230533 | 8.87 | vote committee bill german fourth measure coll... | bill german vote speech committee language nob... | 31: vote committee bill german fourth measure ... |
| 27 | 13266.099095 | 16.544906 | 9.58 | thy hero madam stars heaven victory hopes pare... | thy heaven father thee voice soul alone woman ... | 27: thy hero madam stars heaven victory hopes ... |
| 24 | 4336.777031 | 14.208223 | 8.56 | saddle horse game played journal funny morals ... | horse game played saddle memory dog stage reme... | 24: saddle horse game played journal funny mor... |
| 19 | 5515.768416 | 12.868162 | 7.65 | jury trial witnesses verdict guilty judge pris... | judge court jury trial law evidence prisoner m... | 19: jury trial witnesses verdict guilty judge ... |
| 0 | 4497.493923 | 12.664865 | 8.79 | lie shes twins madam taught lying finger baby ... | lie father child truth shes twins son school p... | 0: lie shes twins madam taught lying finger ba... |
| 17 | 4065.521650 | 12.361717 | 8.34 | french marks literature reply printed funny te... | french american literature art article convers... | 17: french marks literature reply printed funn... |
| 20 | 2543.415053 | 11.481418 | 8.48 | tobacco eat smoking smoke pause pipe ass drink... | eat smoke tobacco pause cat royal smoking pipe... | 20: tobacco eat smoking smoke pause pipe ass d... |
| 1 | 3976.042408 | 11.110674 | 8.35 | natives native monument british indian populat... | native natives women indian monument british p... | 1: natives native monument british indian popu... |
| 5 | 4426.796219 | 10.822974 | 7.30 | dey dat yo en den git bout er em | en dat dey den yo git nigger em bout | 5: dey dat yo en den git bout er em |
| 35 | 3342.756757 | 9.713668 | 8.34 | uncle murdered corpse cabin murder lantern pri... | uncle brother cabin murder kill boys murdered ... | 35: uncle murdered corpse cabin murder lantern... |
| 4 | 3532.911084 | 9.562080 | 8.16 | driver lake conductor mountains station mail s... | lake driver mountains stage station desert sno... | 4: driver lake conductor mountains station mai... |
| 18 | 3592.375037 | 8.454726 | 8.35 | papa mamma cats frank cure funny acquired bath... | papa cats mamma remember lady cat prince table... | 18: papa mamma cats frank cure funny acquired ... |
| 7 | 1770.259201 | 8.391929 | 8.57 | elephant dream telegram dreams angels suspecte... | elephant dream dreams office telegram arrived ... | 7: elephant dream telegram dreams angels suspe... |
| 33 | 2631.197347 | 8.253225 | 8.29 | unto knight tale behold page fighting fought s... | unto tale knight story pass page women seven hair | 33: unto knight tale behold page fighting foug... |
| 16 | 2294.407743 | 6.795957 | 7.76 | ye mob kingdom adventures confess interrupted ... | ye boys mob tree bad books master fair school | 16: ye mob kingdom adventures confess interrup... |
| 3 | 2569.318196 | 4.918893 | 8.09 | frog chinese et bull bet individual motion cit... | frog chinese et bull bet citizens article floo... | 3: frog chinese et bull bet individual motion ... |
| 34 | 1544.790390 | 4.890485 | 8.43 | excursion passengers voyage board steamer pray... | board excursion passengers reached visit voyag... | 34: excursion passengers voyage board steamer ... |
| 23 | 1665.885870 | 4.600938 | 7.57 | 1 knife 2 4 finger 3 5 palace notes | 1 2 knife 4 3 girl grand finger letters | 23: 1 knife 2 4 finger 3 5 palace notes |
| 13 | 1621.287051 | 1.970443 | 8.23 | ich cloth sheep pages aside er enter volume ge... | ich cloth die german aside pages sheep girls e... | 13: ich cloth sheep pages aside er enter volum... |
| 37 | 5405.414808 | 1.378391 | 8.87 | om ym training impulse machine content instinc... | om ym outside training mans machine self spiri... | 37: om ym training impulse machine content ins... |
# Index label (topic_id) of the TOPIC row with the largest theta_sum.
top_topic = tm.TOPIC['theta_sum'].idxmax()
top_topic
36
# top_terms_rel entry of the topic with the highest theta_sum. The lookup is
# by index label, so the row order of TOPIC is irrelevant and no sort is needed.
tm.TOPIC.loc[top_topic, 'top_terms_rel']
'awake stillness cheer whispered sprang wound crept hurried resolved'
# First five terms listed in top_terms_rel for top_topic, the topic with the
# highest theta_sum (highest total weight across all documents). The terms are
# stored as one space-separated string, hence split(). The lookup is by index
# label, so TOPIC does not need to be sorted first.
top_five_terms = tm.TOPIC.loc[top_topic, 'top_terms_rel'].split()[:5]
top_five_terms
['awake', 'stillness', 'cheer', 'whispered', 'sprang']
# Attach each book's LIB metadata to the topic distributions of its chapters.
# THETA is indexed by (book_id, chap_id) and LIB by book_id, so the join
# matches on the shared book_id level.
joint_theta = tm.THETA.join(LIB)
# Move the title into the index next to book_id and chap_id.
joint_theta = joint_theta.set_index('title', append=True)
# Mean topic distribution per book. Only the topic columns are aggregated:
# the joined frame also carries string-valued LIB columns (source_file_path,
# chap_regex, author), and calling mean() on those raises a TypeError on
# pandas >= 2.0 instead of silently skipping them as older versions did.
book_mean_theta = (
    joint_theta
    .groupby(['book_id', 'title', 'type'])[list(tm.THETA.columns)]
    .mean()
)
# Render the table as a heat map with one colour scale over the whole table.
book_mean_theta.style.background_gradient(axis=None)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | title | type | ||||||||||||||||||||||||||||||||||||||||
| 70 | what is man | non-fiction | 0.015837 | 0.000207 | 0.024724 | 0.000207 | 0.006011 | 0.000207 | 0.239277 | 0.000207 | 0.018703 | 0.079202 | 0.033767 | 0.030891 | 0.004796 | 0.010523 | 0.005029 | 0.064338 | 0.000365 | 0.020165 | 0.000588 | 0.013574 | 0.051514 | 0.062734 | 0.008459 | 0.000892 | 0.006019 | 0.017992 | 0.046531 | 0.005332 | 0.002681 | 0.004409 | 0.017280 | 0.001014 | 0.039933 | 0.000207 | 0.000207 | 0.000207 | 0.067864 | 0.059020 | 0.002289 | 0.036799 |
| 74 | the adventures of tom sawyer | novel | 0.014999 | 0.000651 | 0.130010 | 0.001421 | 0.000094 | 0.000426 | 0.006366 | 0.010125 | 0.002706 | 0.013091 | 0.068964 | 0.069881 | 0.003699 | 0.000094 | 0.005028 | 0.000734 | 0.000340 | 0.000094 | 0.003651 | 0.014059 | 0.001477 | 0.031417 | 0.009578 | 0.001900 | 0.002907 | 0.000986 | 0.001002 | 0.007635 | 0.108713 | 0.005429 | 0.003795 | 0.000094 | 0.008173 | 0.000559 | 0.034318 | 0.010806 | 0.400890 | 0.000094 | 0.015084 | 0.008713 |
| 76 | the adventures of huckleberry finn | novel | 0.007904 | 0.000073 | 0.370789 | 0.000357 | 0.000398 | 0.074444 | 0.001506 | 0.000375 | 0.001639 | 0.006232 | 0.007067 | 0.014830 | 0.001845 | 0.000167 | 0.003492 | 0.003834 | 0.002598 | 0.001150 | 0.002302 | 0.003859 | 0.001551 | 0.006208 | 0.003383 | 0.000658 | 0.004467 | 0.000073 | 0.000073 | 0.000073 | 0.441806 | 0.008614 | 0.005177 | 0.000581 | 0.002425 | 0.008447 | 0.000255 | 0.003089 | 0.001281 | 0.000073 | 0.001949 | 0.004957 |
| 86 | a connecticut yankee in king arthurs court | novel | 0.000081 | 0.000668 | 0.026693 | 0.000808 | 0.003958 | 0.000081 | 0.129530 | 0.002151 | 0.014056 | 0.097000 | 0.053231 | 0.048143 | 0.069672 | 0.000456 | 0.003461 | 0.009312 | 0.067076 | 0.003821 | 0.001028 | 0.003223 | 0.003280 | 0.038775 | 0.008887 | 0.001648 | 0.009659 | 0.001817 | 0.013423 | 0.000081 | 0.017457 | 0.027142 | 0.072048 | 0.000373 | 0.006023 | 0.042967 | 0.001395 | 0.006212 | 0.184452 | 0.001443 | 0.008979 | 0.019488 |
| 91 | tom sawyer abroad | novel | 0.000068 | 0.001430 | 0.546590 | 0.000437 | 0.014668 | 0.092809 | 0.009782 | 0.000068 | 0.000068 | 0.000068 | 0.016468 | 0.000068 | 0.000068 | 0.000068 | 0.011637 | 0.010432 | 0.001546 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | 0.113325 | 0.003570 | 0.000068 | 0.002783 | 0.000068 | 0.000068 | 0.000068 | 0.103748 | 0.000068 | 0.012941 | 0.000068 | 0.000835 | 0.011596 | 0.000068 | 0.001969 | 0.008363 | 0.000068 | 0.012569 | 0.021216 |
| 93 | tom sawyer detective | novel | 0.003444 | 0.000096 | 0.272056 | 0.000096 | 0.000096 | 0.000096 | 0.000096 | 0.000096 | 0.000096 | 0.000096 | 0.002470 | 0.032567 | 0.000096 | 0.000096 | 0.009728 | 0.004810 | 0.000096 | 0.000096 | 0.000096 | 0.004442 | 0.000096 | 0.058044 | 0.000096 | 0.000096 | 0.000096 | 0.005156 | 0.000096 | 0.000096 | 0.422689 | 0.009740 | 0.007745 | 0.000096 | 0.000096 | 0.000096 | 0.000096 | 0.143436 | 0.021175 | 0.000096 | 0.000096 | 0.000096 |
| 102 | the tragedy of puddnhead wilson | novel | 0.033287 | 0.000502 | 0.000084 | 0.000084 | 0.000716 | 0.162329 | 0.038289 | 0.000084 | 0.004336 | 0.027687 | 0.013920 | 0.103639 | 0.000084 | 0.000084 | 0.001621 | 0.002837 | 0.000482 | 0.001602 | 0.004153 | 0.061549 | 0.004477 | 0.008278 | 0.008953 | 0.014967 | 0.001359 | 0.010127 | 0.026214 | 0.009691 | 0.052483 | 0.015002 | 0.002400 | 0.002401 | 0.011116 | 0.011090 | 0.007182 | 0.016994 | 0.315906 | 0.000084 | 0.012644 | 0.011264 |
| 119 | a tramp abroad | non-fiction | 0.025215 | 0.001831 | 0.007867 | 0.001248 | 0.005697 | 0.001029 | 0.023348 | 0.000234 | 0.021466 | 0.203728 | 0.301498 | 0.043295 | 0.005434 | 0.001289 | 0.012315 | 0.013832 | 0.000346 | 0.017167 | 0.004428 | 0.002411 | 0.012670 | 0.026230 | 0.011972 | 0.005762 | 0.012617 | 0.037846 | 0.009671 | 0.008961 | 0.022205 | 0.010938 | 0.010925 | 0.024770 | 0.012023 | 0.004354 | 0.000345 | 0.000056 | 0.072252 | 0.000056 | 0.007009 | 0.015659 |
| 142 | the 30000 bequest and other stories | stories | 0.047651 | 0.021243 | 0.003721 | 0.002725 | 0.001459 | 0.000107 | 0.103514 | 0.000676 | 0.089541 | 0.033927 | 0.004709 | 0.081852 | 0.011007 | 0.000107 | 0.008205 | 0.060444 | 0.001908 | 0.014787 | 0.000224 | 0.000878 | 0.049853 | 0.071574 | 0.022724 | 0.002042 | 0.013402 | 0.007691 | 0.038241 | 0.087527 | 0.007142 | 0.036881 | 0.008319 | 0.027735 | 0.044009 | 0.000107 | 0.002158 | 0.007235 | 0.057153 | 0.000107 | 0.015471 | 0.011947 |
| 245 | life on the mississippi | non-fiction | 0.004254 | 0.008460 | 0.041460 | 0.000363 | 0.023363 | 0.000891 | 0.105129 | 0.000141 | 0.014288 | 0.029767 | 0.179551 | 0.016562 | 0.000097 | 0.000985 | 0.076012 | 0.014626 | 0.000602 | 0.010438 | 0.000317 | 0.002610 | 0.003499 | 0.038504 | 0.023839 | 0.001645 | 0.002879 | 0.023811 | 0.015957 | 0.011354 | 0.015325 | 0.132180 | 0.009720 | 0.005537 | 0.013748 | 0.002040 | 0.001074 | 0.010825 | 0.112939 | 0.000097 | 0.024165 | 0.020947 |
| 1044 | extract from captain stormfields visit to Heaven | stories | 0.000021 | 0.000021 | 0.082854 | 0.000021 | 0.000021 | 0.000021 | 0.078688 | 0.000021 | 0.010375 | 0.026730 | 0.000021 | 0.049331 | 0.013223 | 0.000021 | 0.021529 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.446756 | 0.000490 | 0.004104 | 0.010595 | 0.000021 | 0.000021 | 0.030216 | 0.145034 | 0.035694 | 0.018691 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.000021 | 0.025186 | 0.000021 | 0.000021 | 0.000021 |
| 1086 | a horses tale | novel | 0.158913 | 0.002295 | 0.007775 | 0.016298 | 0.018987 | 0.001130 | 0.037573 | 0.000330 | 0.012008 | 0.013947 | 0.014900 | 0.084270 | 0.000330 | 0.000330 | 0.000330 | 0.005307 | 0.000330 | 0.002160 | 0.001310 | 0.004285 | 0.014483 | 0.182807 | 0.016884 | 0.000330 | 0.024845 | 0.000330 | 0.000330 | 0.000330 | 0.099170 | 0.017132 | 0.182116 | 0.038832 | 0.000330 | 0.000330 | 0.000330 | 0.004339 | 0.030512 | 0.000330 | 0.003103 | 0.000330 |
| 1837 | the prince and the pauper | novel | 0.000099 | 0.001300 | 0.000099 | 0.000814 | 0.000872 | 0.000099 | 0.016959 | 0.030296 | 0.011970 | 0.030248 | 0.048097 | 0.061700 | 0.492088 | 0.001224 | 0.000692 | 0.005006 | 0.006650 | 0.000099 | 0.000099 | 0.010654 | 0.002343 | 0.003552 | 0.005884 | 0.001540 | 0.000099 | 0.011548 | 0.001976 | 0.007056 | 0.000099 | 0.000099 | 0.034092 | 0.000099 | 0.005726 | 0.002169 | 0.000099 | 0.000486 | 0.189520 | 0.000099 | 0.003370 | 0.011086 |
| 2874 | personal recollections of joan of arc vol 1 | non-fiction | 0.013243 | 0.002239 | 0.004701 | 0.000090 | 0.000090 | 0.000090 | 0.028672 | 0.000090 | 0.008384 | 0.029070 | 0.022709 | 0.097883 | 0.021119 | 0.000090 | 0.004654 | 0.000090 | 0.001756 | 0.004148 | 0.000090 | 0.006553 | 0.005956 | 0.026663 | 0.018206 | 0.000090 | 0.002021 | 0.003316 | 0.014813 | 0.020061 | 0.014491 | 0.001122 | 0.355081 | 0.000306 | 0.000090 | 0.000535 | 0.000090 | 0.003278 | 0.259538 | 0.000090 | 0.002259 | 0.026225 |
| 2875 | personal recollections of joan of arc vol 2 | non-fiction | 0.002437 | 0.026268 | 0.000100 | 0.001516 | 0.000253 | 0.000281 | 0.049335 | 0.026273 | 0.001762 | 0.006827 | 0.031332 | 0.039127 | 0.004894 | 0.000100 | 0.001345 | 0.001390 | 0.003004 | 0.009282 | 0.000588 | 0.028616 | 0.000100 | 0.015282 | 0.001908 | 0.003857 | 0.000100 | 0.004514 | 0.006064 | 0.020025 | 0.000100 | 0.001653 | 0.365788 | 0.001071 | 0.007019 | 0.001233 | 0.000100 | 0.001239 | 0.312656 | 0.000100 | 0.005283 | 0.017180 |
| 2895 | following the equator | non-fiction | 0.007173 | 0.054910 | 0.003186 | 0.000760 | 0.012864 | 0.000062 | 0.189976 | 0.004922 | 0.014010 | 0.067183 | 0.081159 | 0.027814 | 0.005701 | 0.000834 | 0.055266 | 0.020616 | 0.002022 | 0.006818 | 0.020113 | 0.002673 | 0.005792 | 0.099055 | 0.029851 | 0.002857 | 0.009529 | 0.073620 | 0.012798 | 0.002906 | 0.008320 | 0.010434 | 0.017510 | 0.000890 | 0.011859 | 0.003350 | 0.002020 | 0.002470 | 0.044378 | 0.000062 | 0.040715 | 0.043523 |
| 3171 | in defense of harriet shelley | non-fiction | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.001713 | 0.000029 | 0.033292 | 0.000029 | 0.647061 | 0.033897 | 0.000029 | 0.034673 | 0.032994 | 0.000029 | 0.000029 | 0.034486 | 0.000029 | 0.000029 | 0.001602 | 0.031420 | 0.000029 | 0.006183 | 0.000029 | 0.013426 | 0.000029 | 0.000029 | 0.025839 | 0.067203 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.035467 | 0.000029 | 0.000029 | 0.000029 |
| 3172 | fenimore coopers literary offences | non-fiction | 0.000028 | 0.000028 | 0.101781 | 0.000028 | 0.016329 | 0.000028 | 0.287167 | 0.000028 | 0.000028 | 0.057777 | 0.119758 | 0.000028 | 0.009781 | 0.000028 | 0.022794 | 0.074101 | 0.000028 | 0.079442 | 0.000028 | 0.000028 | 0.000028 | 0.076808 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.028272 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.125200 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 | 0.000028 |
| 3173 | essays on paul bourget | non-fiction | 0.000029 | 0.020714 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.412768 | 0.000029 | 0.000029 | 0.016498 | 0.000029 | 0.030440 | 0.000029 | 0.000029 | 0.000029 | 0.051022 | 0.000029 | 0.373072 | 0.000029 | 0.000029 | 0.000029 | 0.026649 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.045046 | 0.000029 | 0.000029 | 0.003425 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.000029 | 0.019507 | 0.000029 |
| 3176 | the innocents abroad | non-fiction | 0.002929 | 0.001412 | 0.001813 | 0.001334 | 0.007316 | 0.000137 | 0.024747 | 0.001621 | 0.018187 | 0.129649 | 0.147441 | 0.019737 | 0.011071 | 0.000364 | 0.072933 | 0.007349 | 0.001804 | 0.005562 | 0.001442 | 0.002206 | 0.003631 | 0.014165 | 0.012209 | 0.002353 | 0.018413 | 0.019914 | 0.020843 | 0.013620 | 0.006949 | 0.009380 | 0.012001 | 0.005188 | 0.019012 | 0.003378 | 0.029989 | 0.000647 | 0.053347 | 0.000057 | 0.024849 | 0.270999 |
| 3177 | roughing it | novel | 0.000554 | 0.010776 | 0.020091 | 0.003813 | 0.057298 | 0.000470 | 0.039634 | 0.011184 | 0.013990 | 0.015569 | 0.214576 | 0.030030 | 0.004682 | 0.000533 | 0.036372 | 0.004825 | 0.000726 | 0.000677 | 0.003717 | 0.013029 | 0.007988 | 0.029620 | 0.048271 | 0.000746 | 0.032054 | 0.009474 | 0.004497 | 0.005090 | 0.017715 | 0.011668 | 0.008359 | 0.003027 | 0.050774 | 0.008370 | 0.002238 | 0.003251 | 0.085055 | 0.000076 | 0.160074 | 0.029107 |
| 3178 | the gilded age | novel | 0.002170 | 0.024885 | 0.007543 | 0.000865 | 0.002244 | 0.005937 | 0.022617 | 0.000889 | 0.029747 | 0.010523 | 0.047455 | 0.077516 | 0.004726 | 0.000082 | 0.006081 | 0.012670 | 0.000488 | 0.000265 | 0.000383 | 0.045155 | 0.001700 | 0.006116 | 0.014487 | 0.000353 | 0.000739 | 0.081433 | 0.205521 | 0.026081 | 0.040939 | 0.048872 | 0.005177 | 0.070880 | 0.019234 | 0.000472 | 0.000390 | 0.002892 | 0.132015 | 0.000082 | 0.031034 | 0.009341 |
| 3179 | the american claimant | novel | 0.026801 | 0.002196 | 0.000763 | 0.000391 | 0.000068 | 0.003472 | 0.127519 | 0.000068 | 0.015597 | 0.032417 | 0.043406 | 0.244817 | 0.000533 | 0.000282 | 0.010333 | 0.018602 | 0.000388 | 0.011194 | 0.017554 | 0.000242 | 0.002327 | 0.021107 | 0.009467 | 0.000307 | 0.003883 | 0.031241 | 0.061567 | 0.001216 | 0.112787 | 0.014258 | 0.001000 | 0.005397 | 0.013600 | 0.000068 | 0.000068 | 0.000646 | 0.159230 | 0.000068 | 0.002882 | 0.002240 |
| 3180 | a double barrelled detective story | stories | 0.012571 | 0.000720 | 0.024443 | 0.000851 | 0.000168 | 0.000168 | 0.004795 | 0.042020 | 0.104412 | 0.000168 | 0.049473 | 0.084866 | 0.000168 | 0.000168 | 0.000168 | 0.018022 | 0.004303 | 0.006709 | 0.007575 | 0.001992 | 0.000168 | 0.022925 | 0.007109 | 0.002166 | 0.002288 | 0.018674 | 0.019663 | 0.004586 | 0.064796 | 0.004707 | 0.014356 | 0.000168 | 0.003493 | 0.001786 | 0.000168 | 0.197933 | 0.238239 | 0.000168 | 0.032673 | 0.000168 |
| 3181 | the stolen white elephant | stories | 0.000058 | 0.000058 | 0.000058 | 0.000058 | 0.013198 | 0.000058 | 0.136176 | 0.238101 | 0.000058 | 0.009005 | 0.043408 | 0.036767 | 0.000058 | 0.000058 | 0.016199 | 0.008266 | 0.001488 | 0.000058 | 0.002486 | 0.000058 | 0.109247 | 0.007156 | 0.054921 | 0.000058 | 0.000058 | 0.000058 | 0.021863 | 0.000058 | 0.000058 | 0.000058 | 0.018731 | 0.000058 | 0.036174 | 0.000058 | 0.000058 | 0.000058 | 0.200166 | 0.000058 | 0.045367 | 0.000058 |
| 3182 | some rambling notes of an idle excursion | non-fiction | 0.000035 | 0.000035 | 0.039449 | 0.000035 | 0.000035 | 0.000035 | 0.026796 | 0.000035 | 0.039194 | 0.027782 | 0.111699 | 0.000035 | 0.000035 | 0.000035 | 0.165757 | 0.006092 | 0.000780 | 0.000035 | 0.021066 | 0.004057 | 0.000035 | 0.156948 | 0.051934 | 0.002891 | 0.000035 | 0.031544 | 0.000035 | 0.000035 | 0.060701 | 0.011073 | 0.015304 | 0.000035 | 0.000035 | 0.000035 | 0.000035 | 0.000035 | 0.170859 | 0.000035 | 0.009348 | 0.046016 |
| 3183 | the facts concerning the recent carnival of crime in connecticut | stories | 0.000024 | 0.000024 | 0.000024 | 0.000024 | 0.011788 | 0.000024 | 0.000024 | 0.000024 | 0.039136 | 0.000024 | 0.000024 | 0.394230 | 0.000024 | 0.000024 | 0.000024 | 0.000024 | 0.000024 | 0.019680 | 0.000024 | 0.008843 | 0.048866 | 0.000024 | 0.041508 | 0.000024 | 0.000024 | 0.000024 | 0.046179 | 0.000024 | 0.000024 | 0.035618 | 0.000024 | 0.000024 | 0.000024 | 0.000024 | 0.000024 | 0.000024 | 0.353442 | 0.000024 | 0.000024 | 0.000024 |
| 3184 | alonzo fitz and other stories | stories | 0.038418 | 0.002598 | 0.006892 | 0.000085 | 0.000753 | 0.000174 | 0.058258 | 0.000085 | 0.066814 | 0.010390 | 0.021549 | 0.134402 | 0.036737 | 0.000085 | 0.043104 | 0.021617 | 0.000085 | 0.053471 | 0.021797 | 0.001741 | 0.001022 | 0.029354 | 0.072496 | 0.000085 | 0.000980 | 0.025432 | 0.002623 | 0.031469 | 0.012751 | 0.013760 | 0.022848 | 0.088366 | 0.044577 | 0.005427 | 0.000085 | 0.013525 | 0.054702 | 0.000085 | 0.047486 | 0.013873 |
| 3185 | those extraordinary twins | stories | 0.052447 | 0.000104 | 0.000104 | 0.001013 | 0.000104 | 0.000602 | 0.061827 | 0.108139 | 0.007767 | 0.022877 | 0.008452 | 0.060617 | 0.000104 | 0.000104 | 0.003103 | 0.011339 | 0.000104 | 0.000104 | 0.001995 | 0.063391 | 0.006443 | 0.012780 | 0.013175 | 0.000104 | 0.007771 | 0.033507 | 0.000104 | 0.000104 | 0.094463 | 0.000104 | 0.007131 | 0.001266 | 0.000104 | 0.000104 | 0.001524 | 0.014809 | 0.401891 | 0.000104 | 0.000104 | 0.000104 |
| 3186 | the mysterious stranger and other stories | stories | 0.000082 | 0.002489 | 0.017501 | 0.000082 | 0.000082 | 0.000082 | 0.062116 | 0.019195 | 0.000082 | 0.003853 | 0.015920 | 0.425984 | 0.003081 | 0.000082 | 0.000082 | 0.008051 | 0.000082 | 0.002627 | 0.006579 | 0.007117 | 0.071001 | 0.045147 | 0.023938 | 0.000082 | 0.000956 | 0.019543 | 0.001804 | 0.022686 | 0.007338 | 0.011545 | 0.031087 | 0.000082 | 0.000082 | 0.000082 | 0.001971 | 0.028343 | 0.115954 | 0.000082 | 0.015799 | 0.027306 |
| 3188 | mark twain speeches | non-fiction | 0.004950 | 0.014325 | 0.008165 | 0.001100 | 0.002569 | 0.000683 | 0.119498 | 0.014076 | 0.093055 | 0.024026 | 0.012342 | 0.016945 | 0.002724 | 0.003318 | 0.012666 | 0.084886 | 0.008665 | 0.034633 | 0.017168 | 0.004672 | 0.029103 | 0.015901 | 0.020633 | 0.012910 | 0.046060 | 0.051251 | 0.101806 | 0.019758 | 0.020809 | 0.043234 | 0.005857 | 0.049910 | 0.021948 | 0.014093 | 0.004888 | 0.001396 | 0.033006 | 0.001643 | 0.017110 | 0.008220 |
| 3189 | sketches new and old | stories | 0.020941 | 0.001853 | 0.008130 | 0.039235 | 0.003288 | 0.017860 | 0.075588 | 0.002178 | 0.069319 | 0.033062 | 0.066798 | 0.037695 | 0.009860 | 0.000163 | 0.019825 | 0.004438 | 0.015388 | 0.002078 | 0.006904 | 0.026091 | 0.011332 | 0.013961 | 0.039651 | 0.002518 | 0.014353 | 0.031992 | 0.005288 | 0.028629 | 0.014952 | 0.031585 | 0.006172 | 0.019295 | 0.127627 | 0.005038 | 0.006424 | 0.033561 | 0.114019 | 0.000163 | 0.024193 | 0.008551 |
| 3190 | 1601 conversation as it was by the social fireside in the time of the tudors | stories | 0.000079 | 0.003155 | 0.000079 | 0.000079 | 0.000079 | 0.000079 | 0.087715 | 0.000079 | 0.063229 | 0.076651 | 0.000079 | 0.000079 | 0.125918 | 0.000079 | 0.000079 | 0.089730 | 0.109314 | 0.282441 | 0.000079 | 0.014988 | 0.000079 | 0.000079 | 0.000079 | 0.017051 | 0.000079 | 0.000163 | 0.078532 | 0.008715 | 0.000079 | 0.002688 | 0.000079 | 0.000079 | 0.000079 | 0.019463 | 0.000079 | 0.000079 | 0.018350 | 0.000079 | 0.000079 | 0.000079 |
| 3191 | goldsmiths friend abroad again | stories | 0.000255 | 0.022364 | 0.000255 | 0.014304 | 0.000255 | 0.000255 | 0.079371 | 0.000255 | 0.010809 | 0.060000 | 0.002701 | 0.078223 | 0.105030 | 0.000255 | 0.056822 | 0.000255 | 0.015974 | 0.000255 | 0.000255 | 0.085680 | 0.000255 | 0.000255 | 0.034639 | 0.000255 | 0.000255 | 0.000255 | 0.000255 | 0.000255 | 0.099219 | 0.051933 | 0.003505 | 0.015172 | 0.052415 | 0.006176 | 0.005833 | 0.000255 | 0.142858 | 0.000255 | 0.032665 | 0.019710 |
| 3192 | the curious republic of gondour and other whimsical sketches | stories | 0.000296 | 0.000296 | 0.009651 | 0.000296 | 0.002430 | 0.000296 | 0.161618 | 0.001363 | 0.051993 | 0.105641 | 0.022714 | 0.000296 | 0.002674 | 0.000296 | 0.002148 | 0.103351 | 0.015349 | 0.032681 | 0.000296 | 0.013365 | 0.053480 | 0.001180 | 0.016685 | 0.003209 | 0.025152 | 0.031994 | 0.040011 | 0.019850 | 0.025922 | 0.024301 | 0.018148 | 0.023232 | 0.078003 | 0.009249 | 0.000296 | 0.019094 | 0.047777 | 0.000296 | 0.008233 | 0.026840 |
| 3199 | the letters of mark twain | non-fiction | 0.002300 | 0.000659 | 0.000636 | 0.002814 | 0.003943 | 0.000079 | 0.075459 | 0.000689 | 0.033944 | 0.018479 | 0.025956 | 0.022673 | 0.001168 | 0.000093 | 0.022974 | 0.483116 | 0.000812 | 0.002010 | 0.005172 | 0.001092 | 0.000976 | 0.010821 | 0.003137 | 0.032831 | 0.001524 | 0.007648 | 0.114935 | 0.006000 | 0.000996 | 0.015341 | 0.008968 | 0.003102 | 0.021377 | 0.000345 | 0.002219 | 0.000921 | 0.031525 | 0.000027 | 0.023199 | 0.010037 |
| 3250 | how to tell a story and other essays | non-fiction | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.198836 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.051706 | 0.000135 | 0.039667 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.131424 | 0.000135 | 0.000135 | 0.047424 | 0.096634 | 0.000135 | 0.068420 | 0.000135 | 0.000135 | 0.008469 | 0.000135 | 0.294398 | 0.004642 | 0.000135 | 0.054477 | 0.000135 | 0.000135 | 0.000135 |
| 3251 | the man that corrupted hadleyburg and other stories | stories | 0.024478 | 0.003901 | 0.001486 | 0.008960 | 0.000970 | 0.000053 | 0.233429 | 0.010471 | 0.058911 | 0.022358 | 0.018730 | 0.157106 | 0.015588 | 0.000286 | 0.073602 | 0.024788 | 0.000946 | 0.013637 | 0.002295 | 0.010917 | 0.006778 | 0.012980 | 0.025258 | 0.001791 | 0.002230 | 0.039481 | 0.000053 | 0.021884 | 0.014301 | 0.008054 | 0.049766 | 0.014034 | 0.009751 | 0.001928 | 0.000053 | 0.000053 | 0.063122 | 0.000053 | 0.016370 | 0.029151 |
| 19484 | editorial wild oats | stories | 0.000104 | 0.000104 | 0.000104 | 0.198686 | 0.000104 | 0.010594 | 0.000104 | 0.000104 | 0.051820 | 0.000104 | 0.033863 | 0.000104 | 0.000104 | 0.000104 | 0.000104 | 0.000104 | 0.000104 | 0.003507 | 0.000104 | 0.000104 | 0.000104 | 0.000104 | 0.157461 | 0.000104 | 0.022984 | 0.000104 | 0.000104 | 0.000104 | 0.062805 | 0.000104 | 0.000104 | 0.000104 | 0.342716 | 0.000104 | 0.000104 | 0.000104 | 0.089308 | 0.000104 | 0.000104 | 0.023240 |
| 19987 | chapters from my autobiography | non-fiction | 0.005820 | 0.002969 | 0.003783 | 0.003217 | 0.002543 | 0.002551 | 0.126032 | 0.006028 | 0.038181 | 0.056936 | 0.030406 | 0.084507 | 0.001667 | 0.001667 | 0.009996 | 0.082992 | 0.001667 | 0.003470 | 0.081412 | 0.004077 | 0.002040 | 0.022007 | 0.024856 | 0.002290 | 0.024062 | 0.017134 | 0.078411 | 0.002877 | 0.004565 | 0.039931 | 0.009506 | 0.012291 | 0.021209 | 0.002324 | 0.001667 | 0.005971 | 0.143275 | 0.001667 | 0.029715 | 0.004281 |
| 33077 | the treaty with china its provisions explained | non-fiction | 0.000020 | 0.006284 | 0.000020 | 0.196933 | 0.000020 | 0.000020 | 0.598452 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.004785 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.027865 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.000020 | 0.062127 | 0.102881 |
| 60900 | merry tales | stories | 0.000035 | 0.000035 | 0.017146 | 0.000035 | 0.000035 | 0.000035 | 0.062187 | 0.000035 | 0.000035 | 0.000035 | 0.054841 | 0.065214 | 0.000035 | 0.166582 | 0.018382 | 0.012510 | 0.000035 | 0.006638 | 0.000035 | 0.000035 | 0.000035 | 0.024405 | 0.212430 | 0.000035 | 0.000035 | 0.006384 | 0.000035 | 0.021596 | 0.112432 | 0.004155 | 0.044462 | 0.020401 | 0.002008 | 0.000035 | 0.000035 | 0.001502 | 0.145996 | 0.000035 | 0.000035 | 0.000035 |
| 61522 | the 1000000 bank note | stories | 0.000016 | 0.000016 | 0.000016 | 0.000016 | 0.000016 | 0.000016 | 0.084350 | 0.000611 | 0.000016 | 0.061145 | 0.040176 | 0.011274 | 0.001304 | 0.000016 | 0.080678 | 0.066950 | 0.000016 | 0.000870 | 0.001566 | 0.001182 | 0.000016 | 0.021515 | 0.034810 | 0.003878 | 0.000016 | 0.052863 | 0.014945 | 0.333170 | 0.024217 | 0.042316 | 0.002827 | 0.000106 | 0.029752 | 0.000016 | 0.000016 | 0.000016 | 0.076510 | 0.000016 | 0.012721 | 0.000016 |
| 62636 | to the person sitting in darkness | non-fiction | 0.000031 | 0.000031 | 0.000031 | 0.012267 | 0.000031 | 0.000031 | 0.691115 | 0.005603 | 0.000031 | 0.000031 | 0.000031 | 0.022943 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.096668 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.001781 | 0.168600 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 | 0.000031 |
| 62739 | king leopolds soliloquy | stories | 0.000193 | 0.020848 | 0.006304 | 0.000193 | 0.005271 | 0.000193 | 0.554934 | 0.000193 | 0.037668 | 0.006385 | 0.000193 | 0.042094 | 0.021567 | 0.000193 | 0.000193 | 0.000193 | 0.000193 | 0.069722 | 0.000193 | 0.031339 | 0.012609 | 0.025663 | 0.037078 | 0.022773 | 0.000193 | 0.000193 | 0.012710 | 0.002649 | 0.002112 | 0.010726 | 0.018417 | 0.010884 | 0.011534 | 0.000193 | 0.000193 | 0.006417 | 0.017245 | 0.000193 | 0.005345 | 0.004803 |
# Dominant topic per work type: average every topic column over the books
# that share a 'type', then report, for each type, the label of the column
# holding the largest average.
book_mean_theta.groupby('type').mean().idxmax(axis='columns')
type non-fiction 6 novel 36 stories 36 dtype: int64
# Display the tm.TOPIC row labelled 11 (its summary statistics and top terms).
tm.TOPIC.loc[11]
phi_sum 30476.677087 theta_sum 59.764951 h 9.89 top_terms_rel earl sack song loving glanced confess gratitud... top_terms father happy child herself sat voice wife stra... label 11: earl sack song loving glanced confess grat... Name: 11, dtype: object
# Dominant topic per book: for every row of book_mean_theta take the label of
# the column with the largest value. idxmax(axis=1) does this directly, so no
# per-row Python lambda is needed.
max_topic = (
    book_mean_theta.idxmax(axis=1)
    .reset_index(name='topic_id')  # index levels become columns; the values column is named topic_id
    .set_index('topic_id')         # index on topic_id so the join below aligns on it
)
# Join with tm.TOPIC to attach the per-topic columns (terms, sums, label),
# then re-index the result by book.
max_topic = max_topic.join(tm.TOPIC).reset_index().set_index('book_id')
# First five whitespace-separated terms of top_terms_rel. The vectorised .str
# accessor also copes with an empty frame and with missing term strings
# (which stay missing instead of raising on float.split).
max_topic['top_five_terms'] = max_topic.top_terms_rel.str.split().str[:5]
# Display: highest topic ids first, without the 'label' column, with the
# topic_id column shaded.
(
    max_topic
    .sort_values('topic_id', ascending=False)
    .drop('label', axis=1)
    .style.background_gradient(cmap='YlGnBu', subset=['topic_id'])
)
| topic_id | title | type | phi_sum | theta_sum | h | top_terms_rel | top_terms | top_five_terms | |
|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||
| 3176 | 39 | the innocents abroad | non-fiction | 16583.089258 | 33.596436 | 9.140000 | pilgrims temple marble priests stone centuries sacred walls priest | stone church ancient marble walls pilgrims built streets temple | ['pilgrims', 'temple', 'marble', 'priests', 'stone'] |
| 19987 | 36 | chapters from my autobiography | non-fiction | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 3191 | 36 | goldsmiths friend abroad again | stories | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 3185 | 36 | those extraordinary twins | stories | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 3182 | 36 | some rambling notes of an idle excursion | non-fiction | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 3180 | 36 | a double barrelled detective story | stories | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 102 | 36 | the tragedy of puddnhead wilson | novel | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 86 | 36 | a connecticut yankee in king arthurs court | novel | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 74 | 36 | the adventures of tom sawyer | novel | 56674.817260 | 127.577716 | 10.130000 | awake stillness cheer whispered sprang wound crept hurried resolved | boys sat voice answer followed fire tried broke chance | ['awake', 'stillness', 'cheer', 'whispered', 'sprang'] |
| 3250 | 33 | how to tell a story and other essays | non-fiction | 2631.197347 | 8.253225 | 8.290000 | unto knight tale behold page fighting fought sons spell | unto tale knight story pass page women seven hair | ['unto', 'knight', 'tale', 'behold', 'page'] |
| 3189 | 32 | sketches new and old | stories | 11099.074186 | 26.910419 | 9.240000 | editor minister police journal prison paper papers ball press | paper public editor school write office papers following report | ['editor', 'minister', 'police', 'journal', 'prison'] |
| 19484 | 32 | editorial wild oats | stories | 11099.074186 | 26.910419 | 9.240000 | editor minister police journal prison paper papers ball press | paper public editor school write office papers following report | ['editor', 'minister', 'police', 'journal', 'prison'] |
| 2875 | 30 | personal recollections of joan of arc vol 2 | non-fiction | 16947.303924 | 43.172027 | 9.280000 | knights council sword wounded soldiers army soldier battle commanded | war battle army child march sent herself french soldiers | ['knights', 'council', 'sword', 'wounded', 'soldiers'] |
| 2874 | 30 | personal recollections of joan of arc vol 1 | non-fiction | 16947.303924 | 43.172027 | 9.280000 | knights council sword wounded soldiers army soldier battle commanded | war battle army child march sent herself french soldiers | ['knights', 'council', 'sword', 'wounded', 'soldiers'] |
| 93 | 28 | tom sawyer detective | novel | 19540.766202 | 51.483016 | 9.050000 | hes youre theyre nigger haint reckon coffin theyve theyd | hes reckon theres nigger youre wont duke youll youve | ['hes', 'youre', 'theyre', 'nigger', 'haint'] |
| 76 | 28 | the adventures of huckleberry finn | novel | 19540.766202 | 51.483016 | 9.050000 | hes youre theyre nigger haint reckon coffin theyve theyd | hes reckon theres nigger youre wont duke youll youve | ['hes', 'youre', 'theyre', 'nigger', 'haint'] |
| 61522 | 27 | the 1000000 bank note | stories | 13266.099095 | 16.544906 | 9.580000 | thy hero madam stars heaven victory hopes parents divine | thy heaven father thee voice soul alone woman thou | ['thy', 'hero', 'madam', 'stars', 'heaven'] |
| 3178 | 26 | the gilded age | novel | 19734.061371 | 41.714986 | 9.670000 | authors sincerely writer guest society hall replied affairs distinction | perhaps father wrote books society hall suppose quite woman | ['authors', 'sincerely', 'writer', 'guest', 'society'] |
| 60900 | 22 | merry tales | stories | 11200.302142 | 24.080291 | 9.410000 | stove fired gun negro killed guns armed shot murdered | killed shot kill war horse stove officer box road | ['stove', 'fired', 'gun', 'negro', 'killed'] |
| 1086 | 21 | a horses tale | novel | 15361.746982 | 34.968000 | 9.230000 | bird wings brush birds tree shape hasnt begins color | tree black bird makes comes goes big heaven looks | ['bird', 'wings', 'brush', 'birds', 'tree'] |
| 1044 | 21 | extract from captain stormfields visit to Heaven | stories | 15361.746982 | 34.968000 | 9.230000 | bird wings brush birds tree shape hasnt begins color | tree black bird makes comes goes big heaven looks | ['bird', 'wings', 'brush', 'birds', 'tree'] |
| 3190 | 17 | 1601 conversation as it was by the social fireside in the time of the tudors | stories | 4065.521650 | 12.361717 | 8.340000 | french marks literature reply printed funny teach artist article | french american literature art article conversation language marks point | ['french', 'marks', 'literature', 'reply', 'printed'] |
| 3199 | 15 | the letters of mark twain | non-fiction | 30783.182917 | 39.717141 | 9.620000 | letters lecture 12 yours 7 10 9 magazine 3 | letters write wrote written send story yours writing months | ['letters', 'lecture', '12', 'yours', '7'] |
| 1837 | 12 | the prince and the pauper | novel | 11305.222550 | 26.289940 | 9.130000 | hath tis thou thy thee prince lad royal mad | thou thy thee prince hath none tis ye royal | ['hath', 'tis', 'thou', 'thy', 'thee'] |
| 3186 | 11 | the mysterious stranger and other stories | stories | 30476.677087 | 59.764951 | 9.890000 | earl sack song loving glanced confess gratitude foolish sigh | father happy child herself sat voice wife stranger dog | ['earl', 'sack', 'song', 'loving', 'glanced'] |
| 3184 | 11 | alonzo fitz and other stories | stories | 30476.677087 | 59.764951 | 9.890000 | earl sack song loving glanced confess gratitude foolish sigh | father happy child herself sat voice wife stranger dog | ['earl', 'sack', 'song', 'loving', 'glanced'] |
| 3183 | 11 | the facts concerning the recent carnival of crime in connecticut | stories | 30476.677087 | 59.764951 | 9.890000 | earl sack song loving glanced confess gratitude foolish sigh | father happy child herself sat voice wife stranger dog | ['earl', 'sack', 'song', 'loving', 'glanced'] |
| 3179 | 11 | the american claimant | novel | 30476.677087 | 59.764951 | 9.890000 | earl sack song loving glanced confess gratitude foolish sigh | father happy child herself sat voice wife stranger dog | ['earl', 'sack', 'song', 'loving', 'glanced'] |
| 3177 | 10 | roughing it | novel | 40207.637360 | 84.184373 | 9.900000 | glacier steep summit rope scenery ice gloom mountain huge | distance mountain foot deep ground behind sun snow top | ['glacier', 'steep', 'summit', 'rope', 'scenery'] |
| 245 | 10 | life on the mississippi | non-fiction | 40207.637360 | 84.184373 | 9.900000 | glacier steep summit rope scenery ice gloom mountain huge | distance mountain foot deep ground behind sun snow top | ['glacier', 'steep', 'summit', 'rope', 'scenery'] |
| 119 | 10 | a tramp abroad | non-fiction | 40207.637360 | 84.184373 | 9.900000 | glacier steep summit rope scenery ice gloom mountain huge | distance mountain foot deep ground behind sun snow top | ['glacier', 'steep', 'summit', 'rope', 'scenery'] |
| 3171 | 8 | in defense of harriet shelley | non-fiction | 13636.500040 | 34.248539 | 9.430000 | husband baby married poetry marry wife disease chapter aged | wife child chapter husband friend married doctor girl woman | ['husband', 'baby', 'married', 'poetry', 'marry'] |
| 3181 | 7 | the stolen white elephant | stories | 1770.259201 | 8.391929 | 8.570000 | elephant dream telegram dreams angels suspected newspapers trunk brick | elephant dream dreams office telegram arrived chief court father | ['elephant', 'dream', 'telegram', 'dreams', 'angels'] |
| 70 | 6 | what is man | non-fiction | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 3188 | 6 | mark twain speeches | non-fiction | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 142 | 6 | the 30000 bequest and other stories | stories | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 2895 | 6 | following the equator | non-fiction | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 3172 | 6 | fenimore coopers literary offences | non-fiction | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 3173 | 6 | essays on paul bourget | non-fiction | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 62739 | 6 | king leopolds soliloquy | stories | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 3192 | 6 | the curious republic of gondour and other whimsical sketches | stories | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 3251 | 6 | the man that corrupted hadleyburg and other stories | stories | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 33077 | 6 | the treaty with china its provisions explained | non-fiction | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 62636 | 6 | to the person sitting in darkness | non-fiction | 44859.419532 | 88.110400 | 9.920000 | science reverence civilization government nations political religious british insane | government history law race state nation human power quite | ['science', 'reverence', 'civilization', 'government', 'nations'] |
| 91 | 2 | tom sawyer abroad | novel | 16895.707949 | 40.930780 | 9.070000 | raft canoe reckoned cave warnt scared knowed begun woods | warnt raft big boys mile begun run reckon minute | ['raft', 'canoe', 'reckoned', 'cave', 'warnt'] |
# Lift the column-width limit so the long comma-separated title lists are
# displayed in full rather than truncated.
pd.set_option('display.max_colwidth', None)
# One row per dominant topic: how many books it leads ('count') and their
# titles joined into a single string, most frequent topics first.
works_df = (
    max_topic
    .groupby('topic_id')
    .agg(count=('title', 'size'), title=('title', ', '.join))
    .sort_values('count', ascending=False)
)
# Attach each topic's top_terms_rel string; the assignment aligns on the
# topic_id index.
works_df['top_terms_rel'] = tm.TOPIC.top_terms_rel
works_df
| count | title | top_terms_rel | |
|---|---|---|---|
| topic_id | |||
| 6 | 11 | what is man, the 30000 bequest and other stories, following the equator, fenimore coopers literary offences, essays on paul bourget, mark twain speeches, the curious republic of gondour and other whimsical sketches, the man that corrupted hadleyburg and other stories, the treaty with china its provisions explained, to the person sitting in darkness, king leopolds soliloquy | science reverence civilization government nations political religious british insane |
| 36 | 8 | the adventures of tom sawyer, a connecticut yankee in king arthurs court, the tragedy of puddnhead wilson, a double barrelled detective story, some rambling notes of an idle excursion, those extraordinary twins, goldsmiths friend abroad again, chapters from my autobiography | awake stillness cheer whispered sprang wound crept hurried resolved |
| 11 | 4 | the american claimant, the facts concerning the recent carnival of crime in connecticut, alonzo fitz and other stories, the mysterious stranger and other stories | earl sack song loving glanced confess gratitude foolish sigh |
| 10 | 3 | a tramp abroad, life on the mississippi, roughing it | glacier steep summit rope scenery ice gloom mountain huge |
| 21 | 2 | extract from captain stormfields visit to Heaven, a horses tale | bird wings brush birds tree shape hasnt begins color |
| 32 | 2 | sketches new and old, editorial wild oats | editor minister police journal prison paper papers ball press |
| 30 | 2 | personal recollections of joan of arc vol 1, personal recollections of joan of arc vol 2 | knights council sword wounded soldiers army soldier battle commanded |
| 28 | 2 | the adventures of huckleberry finn, tom sawyer detective | hes youre theyre nigger haint reckon coffin theyve theyd |
| 26 | 1 | the gilded age | authors sincerely writer guest society hall replied affairs distinction |
| 33 | 1 | how to tell a story and other essays | unto knight tale behold page fighting fought sons spell |
| 27 | 1 | the 1000000 bank note | thy hero madam stars heaven victory hopes parents divine |
| 2 | 1 | tom sawyer abroad | raft canoe reckoned cave warnt scared knowed begun woods |
| 22 | 1 | merry tales | stove fired gun negro killed guns armed shot murdered |
| 17 | 1 | 1601 conversation as it was by the social fireside in the time of the tudors | french marks literature reply printed funny teach artist article |
| 15 | 1 | the letters of mark twain | letters lecture 12 yours 7 10 9 magazine 3 |
| 12 | 1 | the prince and the pauper | hath tis thou thy thee prince lad royal mad |
| 8 | 1 | in defense of harriet shelley | husband baby married poetry marry wife disease chapter aged |
| 7 | 1 | the stolen white elephant | elephant dream telegram dreams angels suspected newspapers trunk brick |
| 39 | 1 | the innocents abroad | pilgrims temple marble priests stone centuries sacred walls priest |
# Restore the column display width to the pandas default (50 characters):
# https://pandas.pydata.org/docs/user_guide/options.html
pd.reset_option('display.max_colwidth')
# Word2Vec hyperparameters, passed straight through to gensim.
w2v_params = dict(
    min_count = 10,
    workers = 1,
    # vector_size = 246,
    vector_size = 100,
    window = 2
)

# Build one token list per sentence (every OHCO level except token_num).
# Tokens are cast to str first, mirroring the astype('str') already applied to
# BOW and VOCAB: otherwise a token that was read back from CSV as a float NaN
# is trained as a non-string vocabulary key and later appears as a NaN row in
# W2V that cannot be joined to VOCAB.
SENTS = (
    CORPUS.term_str.astype('str')
    .groupby(level=OHCO[:-1])
    .apply(lambda x: x.tolist())
)

model = word2vec.Word2Vec(SENTS.values, **w2v_params)

# Unit-normalised word vectors as a DataFrame indexed (and sorted) by term.
W2V = pd.DataFrame(model.wv.get_normed_vectors(), index=model.wv.index_to_key)
W2V.index.name = 'term_str'
W2V = W2V.sort_index()
W2V.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||||||||||||
| 04 | -0.114225 | 0.095736 | 0.051546 | 0.055306 | 0.074904 | -0.105503 | 0.057929 | 0.241827 | -0.118943 | 0.032787 | ... | 0.034501 | -0.027179 | -0.073654 | -0.009954 | 0.247175 | 0.021245 | 0.052465 | -0.121067 | 0.104029 | 0.171155 |
| 08 | -0.101873 | 0.127570 | 0.037040 | 0.017632 | 0.039355 | -0.123453 | 0.033405 | 0.277762 | -0.086901 | 0.031751 | ... | 0.026811 | -0.028819 | -0.074951 | -0.046670 | 0.280164 | 0.053047 | 0.086309 | -0.135765 | 0.098677 | 0.155418 |
| 1 | -0.106604 | 0.047069 | 0.027273 | 0.010269 | 0.007156 | -0.112879 | 0.013426 | 0.232790 | 0.010953 | 0.030494 | ... | -0.028294 | -0.172603 | -0.058573 | -0.075017 | 0.314978 | -0.010819 | 0.028912 | -0.163268 | 0.086358 | 0.152443 |
| 10 | -0.083764 | -0.035320 | 0.101737 | -0.096142 | 0.090019 | -0.128170 | -0.072589 | 0.229303 | 0.026240 | 0.072759 | ... | -0.052871 | -0.107244 | -0.082400 | -0.028612 | 0.205192 | 0.023381 | 0.085210 | -0.192644 | 0.090751 | 0.125510 |
| 100 | -0.125445 | 0.081001 | 0.076093 | -0.188683 | 0.015055 | -0.188129 | -0.080429 | 0.243127 | 0.091042 | 0.098380 | ... | -0.045387 | -0.086018 | 0.004878 | -0.100249 | 0.231731 | 0.039334 | 0.032650 | -0.163369 | 0.084874 | 0.098967 |
5 rows × 100 columns
# t-SNE settings for projecting the word vectors down to two dimensions.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne_params = {
    'learning_rate': 200.,  # 'auto' or [10.0, 1000.0]
    'perplexity': 40,
    'n_components': 2,
    'init': 'random',  # 'pca'
    'n_iter': 2500,
    'random_state': 23,
}
tsne_engine = TSNE(**tsne_params)
tsne_model = tsne_engine.fit_transform(W2V)

# Label the 2-D coordinates by term and pull in the VOCAB features used for
# plotting; a left join keeps every term that has a vector.
COORDS = (
    pd.DataFrame(tsne_model, index=W2V.index, columns=['x', 'y'])
    .join(VOCAB, how='left')
    .loc[:, ['x', 'y', 'n', 'dfidf', 'pos_group']]
)
# Natural log of the term count, used later as a marker size.
COORDS['log_n'] = np.log(COORDS['n'])
COORDS
| x | y | n | dfidf | pos_group | log_n | |
|---|---|---|---|---|---|---|
| term_str | ||||||
| 04 | -49.950764 | 19.800415 | 10.0 | 18.227484 | NN | 2.302585 |
| 08 | -50.145111 | 19.059072 | 10.0 | 10.113742 | NN | 2.302585 |
| 1 | -57.705616 | 17.749544 | 331.0 | 428.368264 | CD | 5.802118 |
| 10 | -58.390575 | 15.621562 | 135.0 | 288.917371 | CD | 4.905275 |
| 100 | -55.285233 | 11.433803 | 62.0 | 181.458686 | CD | 4.127134 |
| ... | ... | ... | ... | ... | ... | ... |
| zest | -12.579935 | 5.808639 | 12.0 | 67.918141 | NN | 2.484907 |
| zu | -51.100506 | 52.666790 | 22.0 | 25.586339 | NN | 3.091042 |
| à | -56.147686 | 51.326492 | 44.0 | 51.144711 | NN | 3.784190 |
| était | -53.847881 | 50.613708 | 13.0 | 10.113742 | NN | 2.564949 |
| NaN | -14.547632 | 61.285461 | NaN | NaN | NaN | NaN |
13676 rows × 6 columns
# Scatter a random sample of (up to) 1000 terms in t-SNE space, coloured by
# POS group and sized by DFIDF.
# Rows without a DFIDF value are dropped before sampling: the marker size is
# driven by that column and plotly rejects NaN sizes, so an unlucky sample
# containing such a row would make the cell fail intermittently.
sample_COORDS = COORDS.dropna(subset=['dfidf']).reset_index()
# Cap the sample size so a small vocabulary does not raise in sample().
sample_COORDS = sample_COORDS.sample(min(1000, len(sample_COORDS)))
px.scatter(sample_COORDS,
           'x', 'y',
           text='term_str',
           color='pos_group',
           hover_name='term_str',
           size='dfidf',
           height=1000).update_traces(
    mode='markers+text',
    textfont=dict(color='black', size=14, family='Arial'),
    textposition='top center')
# Scatter the 1000 terms with the highest DFIDF in t-SNE space, coloured by
# POS group and sized by DFIDF.
top_dfidf_COORDS = (
    COORDS
    .reset_index()
    .sort_values('dfidf', ascending=False)
    .head(1000)
)
top_dfidf_fig = px.scatter(
    top_dfidf_COORDS,
    x='x',
    y='y',
    text='term_str',
    color='pos_group',
    hover_name='term_str',
    size='dfidf',
    height=1000,
)
# Draw each term's label above its marker; the returned figure is displayed.
top_dfidf_fig.update_traces(
    mode='markers+text',
    textfont={'color': 'black', 'size': 14, 'family': 'Arial'},
    textposition='top center',
)
# Keep only the terms whose POS group is 'NN'.
is_noun = COORDS['pos_group'].eq('NN')
noun_COORDS = COORDS[is_noun]
noun_COORDS
| x | y | n | dfidf | pos_group | log_n | |
|---|---|---|---|---|---|---|
| term_str | ||||||
| 04 | -49.950764 | 19.800415 | 10.0 | 18.227484 | NN | 2.302585 |
| 08 | -50.145111 | 19.059072 | 10.0 | 10.113742 | NN | 2.302585 |
| 350 | -53.410347 | 9.169445 | 24.0 | 67.918141 | NN | 3.178054 |
| 87 | -49.367786 | 18.569710 | 13.0 | 38.959070 | NN | 2.564949 |
| 89 | -51.060143 | 18.927137 | 15.0 | 38.959070 | NN | 2.708050 |
| ... | ... | ... | ... | ... | ... | ... |
| zermatt | 34.265057 | -67.250267 | 46.0 | 67.918141 | NN | 3.828641 |
| zest | -12.579935 | 5.808639 | 12.0 | 67.918141 | NN | 2.484907 |
| zu | -51.100506 | 52.666790 | 22.0 | 25.586339 | NN | 3.091042 |
| à | -56.147686 | 51.326492 | 44.0 | 51.144711 | NN | 3.784190 |
| était | -53.847881 | 50.613708 | 13.0 | 10.113742 | NN | 2.564949 |
7916 rows × 6 columns
# Scatter a random sample of 1000 'NN' terms in t-SNE space, sized by the
# log of their corpus count.
noun_sample = noun_COORDS.reset_index().sample(1000)
noun_fig = px.scatter(
    noun_sample,
    x='x',
    y='y',
    text='term_str',
    color='pos_group',
    hover_name='term_str',
    size='log_n',
    height=1000,
)
# Draw each term's label above its marker; the returned figure is displayed.
noun_fig.update_traces(
    mode='markers+text',
    textfont={'color': 'black', 'size': 14, 'family': 'Arial'},
    textposition='top center',
)
def complete_analogy(A, B, C, n=2):
    """Return the top ``n`` candidates completing "A is to B as C is to ?".

    The query is sent to the trained Word2Vec model with ``B`` and ``C`` as
    positive terms and ``A`` as the negative term.

    Parameters:
        A, B, C: terms of the analogy; each must be in the model vocabulary.
        n: number of candidates to return (non-negative integer).

    Returns:
        A DataFrame with columns ``term`` and ``sim``, or ``None`` (after
        printing the error) when one of the terms is not in the vocabulary.
    """
    try:
        # Ask the model for exactly n hits: slicing the default result would
        # silently cap the answer at most_similar's default of 10 rows.
        hits = model.wv.most_similar(positive=[B, C], negative=[A], topn=n)
    except KeyError as e:
        print('Error:', e)
        return None
    return pd.DataFrame(hits, columns=['term', 'sim'])
def get_most_similar(positive, negative=None):
    """Return the model's most similar terms for a query as a DataFrame.

    ``positive`` and ``negative`` are forwarded unchanged to the Word2Vec
    model's ``most_similar``; the result has columns ``term`` and ``sim``.
    """
    neighbours = model.wv.most_similar(positive, negative)
    result = pd.DataFrame(neighbours, columns=['term', 'sim'])
    return result
# man : boy :: woman : ?  (top 3)
complete_analogy(A='man', B='boy', C='woman', n=3)
| term | sim | |
|---|---|---|
| 0 | child | 0.776828 |
| 1 | girl | 0.767200 |
| 2 | lady | 0.722114 |
# girl : daughter :: boy : ?  (top 3)
complete_analogy(A='girl', B='daughter', C='boy', n=3)
| term | sim | |
|---|---|---|
| 0 | brother | 0.823369 |
| 1 | sister | 0.801899 |
| 2 | darling | 0.779816 |
# girl : sister :: boy : ?  (top 3)
complete_analogy(A='girl', B='sister', C='boy', n=3)
| term | sim | |
|---|---|---|
| 0 | darling | 0.804392 |
| 1 | brother | 0.759961 |
| 2 | liege | 0.728171 |
# man : gentleman :: woman : ?  (top 5)
complete_analogy(A='man', B='gentleman', C='woman', n=5)
| term | sim | |
|---|---|---|
| 0 | lady | 0.878961 |
| 1 | girl | 0.817366 |
| 2 | fellow | 0.735923 |
| 3 | soldier | 0.704647 |
| 4 | farmer | 0.695843 |
# woman : lady :: man : ?  (top 5)
complete_analogy(A='woman', B='lady', C='man', n=5)
| term | sim | |
|---|---|---|
| 0 | gentleman | 0.824686 |
| 1 | master | 0.699578 |
| 2 | citizen | 0.692256 |
| 3 | person | 0.685369 |
| 4 | stranger | 0.683805 |
# day : sun :: night : ?  (top 5)
complete_analogy(A='day', B='sun', C='night', n=5)
| term | sim | |
|---|---|---|
| 0 | rain | 0.783103 |
| 1 | wind | 0.757743 |
| 2 | darkness | 0.744765 |
| 3 | storm | 0.722417 |
| 4 | curtain | 0.719862 |
# king : rich :: servant : ?  (top 5)
complete_analogy(A='king', B='rich', C='servant', n=5)
| term | sim | |
|---|---|---|
| 0 | slender | 0.711932 |
| 1 | graceful | 0.702313 |
| 2 | handsome | 0.695950 |
| 3 | splendid | 0.687642 |
| 4 | fat | 0.687409 |
# lord : rich :: servant : ?  (top 5)
complete_analogy(A='lord', B='rich', C='servant', n=5)
| term | sim | |
|---|---|---|
| 0 | handsome | 0.720002 |
| 1 | graceful | 0.715430 |
| 2 | slender | 0.702366 |
| 3 | coarse | 0.686488 |
| 4 | dumb | 0.669372 |
# man : journey :: woman : ?  (top 5)
complete_analogy(A='man', B='journey', C='woman', n=5)
| term | sim | |
|---|---|---|
| 0 | voyage | 0.706367 |
| 1 | trip | 0.658042 |
| 2 | stretch | 0.630050 |
| 3 | spring | 0.614480 |
| 4 | flight | 0.605543 |
# woman : marriage :: man : ?  (top 5)
complete_analogy(A='woman', B='marriage', C='man', n=5)
| term | sim | |
|---|---|---|
| 0 | commission | 0.753536 |
| 1 | services | 0.752548 |
| 2 | birth | 0.733643 |
| 3 | powers | 0.730161 |
| 4 | departure | 0.726789 |
# man : property :: woman : ?  (top 5)
complete_analogy(A='man', B='property', C='woman', n=5)
| term | sim | |
|---|---|---|
| 0 | affairs | 0.755897 |
| 1 | rights | 0.741495 |
| 2 | society | 0.733355 |
| 3 | sorrow | 0.725145 |
| 4 | religion | 0.721457 |
# man : fool :: woman : ?  (top 5)
complete_analogy(A='man', B='fool', C='woman', n=5)
| term | sim | |
|---|---|---|
| 0 | devil | 0.696721 |
| 1 | child | 0.651625 |
| 2 | lad | 0.635922 |
| 3 | girl | 0.624696 |
| 4 | beggar | 0.623952 |
# woman : fool :: man : ?  (top 5)
complete_analogy(A='woman', B='fool', C='man', n=5)
| term | sim | |
|---|---|---|
| 0 | person | 0.644702 |
| 1 | hurry | 0.603647 |
| 2 | dog | 0.591042 |
| 3 | stranger | 0.585422 |
| 4 | chance | 0.574519 |
# man : wise :: woman : ?  (top 5)
complete_analogy(A='man', B='wise', C='woman', n=5)
| term | sim | |
|---|---|---|
| 0 | innocent | 0.697347 |
| 1 | foolish | 0.680637 |
| 2 | brave | 0.677902 |
| 3 | simple | 0.635330 |
| 4 | ignorant | 0.630998 |
# woman : wise :: man : ?  (top 5)
complete_analogy(A='woman', B='wise', C='man', n=5)
| term | sim | |
|---|---|---|
| 0 | worthy | 0.670954 |
| 1 | reasonable | 0.644145 |
| 2 | useful | 0.641109 |
| 3 | correct | 0.634468 |
| 4 | likely | 0.620507 |
# Terms the model ranks as most similar to 'joy'.
get_most_similar(positive='joy')
| term | sim | |
|---|---|---|
| 0 | delight | 0.801221 |
| 1 | admiration | 0.784409 |
| 2 | gratitude | 0.747991 |
| 3 | sorrow | 0.747686 |
| 4 | astonishment | 0.731498 |
| 5 | fright | 0.726452 |
| 6 | blessing | 0.722288 |
| 7 | spirit | 0.719269 |
| 8 | glory | 0.714442 |
| 9 | excitement | 0.705790 |
# Terms the model ranks as most similar to 'man'.
get_most_similar(positive='man')
| term | sim | |
|---|---|---|
| 0 | person | 0.850920 |
| 1 | gentleman | 0.794088 |
| 2 | woman | 0.766792 |
| 3 | stranger | 0.740261 |
| 4 | dog | 0.718034 |
| 5 | fellow | 0.690010 |
| 6 | fool | 0.664921 |
| 7 | citizen | 0.647081 |
| 8 | girl | 0.645880 |
| 9 | slave | 0.635982 |
# Terms closest to 'man' once 'woman' is used as a negative term.
get_most_similar(
    positive=['man'],
    negative=['woman'],
)
| term | sim | |
|---|---|---|
| 0 | money | 0.344735 |
| 1 | business | 0.268716 |
| 2 | necessary | 0.265689 |
| 3 | government | 0.258725 |
| 4 | chance | 0.255060 |
| 5 | yourself | 0.251223 |
| 6 | public | 0.250115 |
| 7 | wrong | 0.246364 |
| 8 | going | 0.243778 |
| 9 | further | 0.243301 |
# Terms the model ranks as most similar to 'woman'.
get_most_similar('woman')
| term | sim | |
|---|---|---|
| 0 | girl | 0.866291 |
| 1 | gentleman | 0.831650 |
| 2 | lady | 0.820941 |
| 3 | fellow | 0.811571 |
| 4 | man | 0.766792 |
| 5 | soldier | 0.765872 |
| 6 | person | 0.760113 |
| 7 | creature | 0.756210 |
| 8 | slave | 0.755404 |
| 9 | child | 0.738329 |
# Terms closest to 'woman' once 'man' is used as a negative term.
get_most_similar(
    positive=['woman'],
    negative=['man'],
)
| term | sim | |
|---|---|---|
| 0 | young | 0.452947 |
| 1 | sweet | 0.428815 |
| 2 | friendless | 0.419484 |
| 3 | sister | 0.403715 |
| 4 | gray | 0.398251 |
| 5 | jane | 0.391606 |
| 6 | colored | 0.371316 |
| 7 | husband | 0.370074 |
| 8 | peasant | 0.368533 |
| 9 | old | 0.367854 |
# Terms closest to 'man' + 'woman' with 'boy' and 'girl' as negative terms.
get_most_similar(
    positive=['man', 'woman'],
    negative=['boy', 'girl'],
)
| term | sim | |
|---|---|---|
| 0 | free | 0.305291 |
| 1 | human | 0.292779 |
| 2 | neither | 0.269210 |
| 3 | nor | 0.239546 |
| 4 | honorable | 0.234591 |
| 5 | an | 0.230809 |
| 6 | lack | 0.229228 |
| 7 | reasonable | 0.228167 |
| 8 | utter | 0.226937 |
| 9 | independent | 0.226737 |
# Terms the model ranks as most similar to 'knowledge'.
get_most_similar(positive='knowledge')
| term | sim | |
|---|---|---|
| 0 | quality | 0.846008 |
| 1 | method | 0.830475 |
| 2 | genius | 0.826863 |
| 3 | system | 0.826707 |
| 4 | statement | 0.825331 |
| 5 | invention | 0.824492 |
| 6 | importance | 0.820199 |
| 7 | language | 0.815986 |
| 8 | crime | 0.814865 |
| 9 | wisdom | 0.811844 |
# Terms the model ranks as most similar to 'rich'.
get_most_similar(positive='rich')
| term | sim | |
|---|---|---|
| 0 | handsome | 0.765206 |
| 1 | graceful | 0.745025 |
| 2 | pure | 0.732829 |
| 3 | charming | 0.727296 |
| 4 | nice | 0.724187 |
| 5 | picturesque | 0.721800 |
| 6 | neat | 0.716759 |
| 7 | comely | 0.710631 |
| 8 | fine | 0.710626 |
| 9 | beautiful | 0.706341 |
# Terms the model ranks as most similar to 'poor'.
get_most_similar(positive='poor')
| term | sim | |
|---|---|---|
| 0 | brave | 0.669315 |
| 1 | young | 0.642734 |
| 2 | friendless | 0.611581 |
| 3 | devil | 0.585920 |
| 4 | sick | 0.568551 |
| 5 | weak | 0.567297 |
| 6 | gentle | 0.563504 |
| 7 | child | 0.557940 |
| 8 | girl | 0.547192 |
| 9 | innocent | 0.546639 |
# Terms the model ranks as most similar to 'money'.
get_most_similar(positive='money')
| term | sim | |
|---|---|---|
| 0 | trouble | 0.772665 |
| 1 | food | 0.682620 |
| 2 | stock | 0.677449 |
| 3 | orders | 0.652256 |
| 4 | delay | 0.649710 |
| 5 | use | 0.641453 |
| 6 | chance | 0.635294 |
| 7 | wages | 0.626738 |
| 8 | profit | 0.626591 |
| 9 | purpose | 0.626240 |
Reference for dropping columns with `drop` and selecting with `loc` in pandas: https://www.geeksforgeeks.org/how-to-drop-one-or-multiple-columns-in-pandas-dataframe/